Dependencies
library(tidyverse)
## ── Attaching packages ──────────────────────────────────────────────────────────────── tidyverse 1.2.1 ──
## ✔ ggplot2 3.1.0 ✔ purrr 0.2.5
## ✔ tibble 1.4.2 ✔ dplyr 0.7.8
## ✔ tidyr 0.8.2 ✔ stringr 1.3.1
## ✔ readr 1.3.0 ✔ forcats 0.3.0
## ── Conflicts ─────────────────────────────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(plotly)
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
library(dplyr)
library(klaR)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:plotly':
##
## select
## The following object is masked from 'package:dplyr':
##
## select
# Load the Black Friday purchase records; column types are guessed by readr.
data.tb <- readr::read_csv(file = "./data/BlackFriday.csv")
## Parsed with column specification:
## cols(
## User_ID = col_double(),
## Product_ID = col_character(),
## Gender = col_character(),
## Age = col_character(),
## Occupation = col_double(),
## City_Category = col_character(),
## Stay_In_Current_City_Years = col_character(),
## Marital_Status = col_double(),
## Product_Category_1 = col_double(),
## Product_Category_2 = col_double(),
## Product_Category_3 = col_double(),
## Purchase = col_double()
## )
# Preview the first 25 rows of the full data set.
head(data.tb, n = 25)
## # A tibble: 25 x 12
## User_ID Product_ID Gender Age Occupation City_Category
## <dbl> <chr> <chr> <chr> <dbl> <chr>
## 1 1000001 P00069042 F 0-17 10 A
## 2 1000001 P00248942 F 0-17 10 A
## 3 1000001 P00087842 F 0-17 10 A
## 4 1000001 P00085442 F 0-17 10 A
## 5 1000002 P00285442 M 55+ 16 C
## 6 1000003 P00193542 M 26-35 15 A
## 7 1000004 P00184942 M 46-50 7 B
## 8 1000004 P00346142 M 46-50 7 B
## 9 1000004 P0097242 M 46-50 7 B
## 10 1000005 P00274942 M 26-35 20 A
## # ... with 15 more rows, and 6 more variables:
## # Stay_In_Current_City_Years <chr>, Marital_Status <dbl>,
## # Product_Category_1 <dbl>, Product_Category_2 <dbl>,
## # Product_Category_3 <dbl>, Purchase <dbl>
# Work on the first 2000 rows only.
test.tb <- head(data.tb, n = 2000)
Find the unique values of Age
# Distinct age-group labels, in order of first appearance.
test.tb[["Age"]] %>% unique()
## [1] "0-17" "55+" "26-35" "46-50" "51-55" "36-45" "18-25"
Function to map each Age group label to a numeric code representing the group
#' Map age-group labels to ordinal numeric codes.
#'
#' Vectorised: accepts a single label or a whole column of labels, so it can
#' be applied directly to a data-frame column instead of element by element.
#'
#' @param age Character vector (or factor) of age-group labels: "0-17",
#'   "18-25", "26-35", "36-45", "46-50", "51-55" or "55+".
#' @return Numeric vector the same length as `age`, holding the codes 1-7 in
#'   the order listed above. Labels that are not recognised (including `NA`)
#'   yield `NA` rather than a silent `NULL`.
ageFilter <- function(age) {
  codes <- c(
    "0-17" = 1, "18-25" = 2, "26-35" = 3, "36-45" = 4,
    "46-50" = 5, "51-55" = 6, "55+" = 7
  )
  # as.character() so that factor input is looked up by its label rather than
  # by its internal integer level.
  unname(codes[as.character(age)])
}
# Quick check on a single label.
ageFilter("51-55")
## [1] 6
Pipe the Age vector into ageFilter() and append the result to the tibble
##data.tb$age <- ageFilter(data.tb$Age)
#data.tb %>% head()
# Encode every Age label with ageFilter(), one element at a time, and store
# the result in a new `age` column. vapply() replaces the 1:length() index
# loop: it checks that each call returns exactly one number and copes with an
# empty table.
# The codes are stored as character strings on purpose: that is what the
# previous element-wise assignment into a copy of the character Age column
# produced, lm() below relies on it to treat `age` as categorical, and cor()
# converts back with as.numeric().
test.tb$age <- as.character(
  vapply(test.tb$Age, ageFilter, numeric(1), USE.NAMES = FALSE)
)
Regression: Age vs Purchase Value
# Linear model of purchase amount on the age code with the intercept removed
# (+ 0). Because `age` is stored as character, lm() expands it into one
# indicator per age group, so each coefficient is that group's mean purchase.
fit <- lm(
  formula = test.tb$Purchase ~ test.tb$age + 0,
  data = test.tb
)
# Show only the coefficients component of the fitted model.
fit["coefficients"]
## $coefficients
## test.tb$age1 test.tb$age2 test.tb$age3 test.tb$age4 test.tb$age5
## 9968.213 9256.855 9785.878 9799.988 9218.589
## test.tb$age6 test.tb$age7
## 8482.071 8192.040
Correlation: Age vs Purchase Value
# Pearson correlation between the age code (converted back to numeric) and
# the purchase amount.
with(test.tb, cor(as.numeric(age), as.numeric(Purchase)))
## [1] -0.04984716
# Keep only complete rows before plotting.
# NOTE(review): na.omit() drops a row if ANY column is NA, not only Age or
# Purchase - confirm that this is the intended filter.
test_na.tb <- test.tb %>% na.omit()

# Bar trace of purchase amount (y) against age-group label (x).
plot <- plot_ly(
  x = test_na.tb$Age,
  y = test_na.tb$Purchase,
  name = "Age vs. Purchase Amount",
  type = "bar"
)
plot